******************************************************************************************************
*	PUBLIC DISCLOSURE 
*	Author: 		Obeid Rehman
*	Last Edited:	25th June 2018
*	This program generates name uniqueness measures for the full tax disclosure data
*
******************************************************************************************************

// 0. Session setup
clear all
set more off
// -capture- keeps the script running even if the working directory cannot be changed
capture cd "C:\Users\Obeid\Dropbox\Public Disclosure\Empirics\Build\Output"


// 1. Appending All Disclosure Data
// Start from the 12 file and stack the 13-15 files underneath it
use Disc_All_12.dta
foreach yr of numlist 13/15 {
	append using Disc_All_`yr'.dta
}
count							// 3,651,397


// 2. Cleaning Names
keep if type == 3						// Individuals only
contract name, freq(name_count)			// One row per raw name, with its frequency

generate name_clean = upper(name)

// Blank out punctuation. These characters are safe to carry through a loop macro.
foreach ch in "-" "." "," "#" "*" "?" "=" ";" ":" "/" "+" "(" ")" "@" "_" "[" "]" "<" {
	replace name_clean = subinstr(name_clean, "`ch'", " ", .)
}
// Apostrophe, backslash and backtick interact with macro quoting, so they are handled one by one
replace name_clean = subinstr(name_clean,"'"," ",.)
replace name_clean = subinstr(name_clean,"\"," ",.)
replace name_clean = subinstr(name_clean,"`"," ",.)

// Blank out digits
forvalues digit = 0/9 {
	replace name_clean = subinstr(name_clean, "`digit'", " ", .)
}

// Drop leading/trailing blanks, then collapse runs of internal blanks
replace name_clean = strtrim(name_clean)
replace name_clean = stritrim(name_clean)

// Strip a leading title (the name itself is kept, only the prefix is removed)
foreach title in MR MRS MS MST {
	replace name_clean = subinstr(name_clean, "`title' ", "", 1) if strpos(name_clean, "`title' ") == 1
}


// Frequency of each cleaned name, summed over the raw spellings that map to it
bysort name_clean: egen name_count_v1 = sum(name_count)
// One variable per word: name_clean1, name_clean2, ...
split name_clean
rename name_clean name_clean_v1

/*Generating list of Unique Names to check spellings
preserve	
	reshape 	long name_clean, i(name_full) j(j)
	drop 		name_full j name_coun
	contract	name_clean
	export 		delimited using "C:\Users\Obeid\Dropbox\Public Disclosure\Empirics\Build\Input\name_list.csv", replace
restore
*/

// 3. Importing Name Corrections
// The working data are set aside with -preserve- while the corrections list is
// loaded, filtered and stored in a temporary file for the merges below.
preserve
	tempfile corrections
	import delimited "C:\Users\Obeid\Dropbox\Public Disclosure\Empirics\Build\Input\name_list_corrections.csv", clear
	// Keep only rows that have both a word and a correction for it
	drop if correct == "" | name_clean == ""
	drop _freq
	sort name_clean
	save `corrections'
restore

// 4. Standardizing Names
// Each word of every cleaned name is looked up in the corrections list and, when a
// match is found, replaced by its corrected spelling. The lookup is run twice: the
// corrected names from the first pass are re-split into words and checked again to
// catch any remaining mistakes. The result is stored in name_clean_v2.
//
// The number of word variables is taken from the data instead of being fixed at 16,
// so names with more words are no longer truncated and data with fewer words no
// longer abort on a missing variable.
//
// NOTE(review): the old-style -merge- below does not enforce that name_clean is
// unique in the corrections file -- confirm the CSV has one row per word.

// Count the word variables (name_clean1, name_clean2, ...) left by the earlier -split-
local nwords = 0
local next   = 1
capture confirm variable name_clean`next', exact
while _rc == 0 {
	local nwords = `next'
	local ++next
	capture confirm variable name_clean`next', exact
}

forvalues pass = 1/2 {
	if `nwords' < 1 {
		display as error "no name_clean# word variables to standardize (pass `pass')"
		exit 111
	}

	forvalues n = 1/`nwords' {
		// The corrections file is keyed on name_clean, so each word variable
		// temporarily takes that name for the merge
		rename	name_clean`n' name_clean
		sort 	name_clean
		merge 	name_clean using `corrections'
		replace name_clean = correct if _merge == 3
		drop 	if _merge == 2			// corrections that match no word in the data
		drop 	_merge correct
		rename  name_clean name_clean`n'
	}

	// Glue the (corrected) words back into a single name
	egen    name_corr = concat(name_clean1-name_clean`nwords'), punct(" ")
	drop	name_clean1-name_clean`nwords'

	if `pass' == 1 {
		// 4.1 Repeating Process to Catch any Mistakes: re-split for the second pass
		rename  name_corr name_clean
		split   name_clean
		local   nwords = r(nvars)		// word count may differ after corrections
		drop    name_clean
	}
	else {
		rename  name_corr name_clean_v2
	}
}

// 5. Final Cleaning
// Join the split prefixes "ABD UL", "ABD UR" and "ABD US" into a single word.
// The trailing space in the pattern only marks the end of the word, so it is kept in
// the replacement; otherwise the prefix would be fused with the following word
// (e.g. "ABD UL REHMAN" would become "ABDULREHMAN" instead of "ABDUL REHMAN").
foreach suffix in UL UR US {
	replace name_clean_v2 = subinstr(name_clean_v2, "ABD `suffix' ", "ABD`suffix' ", .)
}

// Remove leading/trailing blanks and collapse repeated blanks. This is done after
// the replacements above so that a prefix at the very end of a padded name is still
// matched. NOTE(review): the padding is presumed to come from -egen concat- adding a
// separator for empty word slots -- confirm.
replace name_clean_v2 = stritrim(strtrim(name_clean_v2))

// Frequency of each standardized name, summed over the raw spellings that map to it
bysort  name_clean_v2: egen name_count_v2 = total(name_count)


// 6. Output
save 	"C:\Users\Obeid\Dropbox\Public Disclosure\Empirics\Build\Output\Name_counts.dta", replace


/*
preserve
	duplicates drop name_clean_v2, force
	summ name_count_v2,d
restore

* Merging back into disclosure data

use 	Disc_All_12.dta 
append  using Disc_All_13.dta
append  using Disc_All_14.dta
append  using Disc_All_15.dta

keep if type == 3
count				//3,651,393

merge m:1 name using "C:\Users\Obeid\Dropbox\Public Disclosure\Empirics\Build\Output\Name_counts.dta"
